Source Data: US domestic flights from 1990 to 2009, US Census Bureau
| Airport Code | City | Percent Seats Filled |
|---|---|---|
| HNL | Honolulu, HI | 83.11 |
| PBI | West Palm Beach, FL | 82.03 |
| MCO | Orlando, FL | 81.64 |
| MIA | Miami, FL | 81.42 |
| SFO | San Francisco, CA | 81.10 |
| FLL | Fort Lauderdale, FL | 80.57 |
| TPA | Tampa, FL | 80.38 |
| SEA | Seattle, WA | 80.25 |
| LAS | Las Vegas, NV | 80.20 |
| IAH | Houston, TX | 79.48 |
| Origin | Destination | Percent Seats Filled |
|---|---|---|
| Nashville, TN | Seattle, WA | 95.03 |
| Springfield, MO | Las Vegas, NV | 94.86 |
| Seattle, WA | Charlotte, NC | 94.43 |
| All Routes | Median: | 75.28 |
If Houston - Dallas had average rate of seats filled, it would have saved around 104,000 empty seats in 2009
---
title: "2009 Domestic Flight Analysis"
date: "2/27/2020"
output:
flexdashboard::flex_dashboard:
orientation: rows
vertical_layout: fill
source: embed
theme: yeti
---
``` {r echo = FALSE}
# Course: 5210 Communicating Data
# Purpose: General Mills Cereal Sales Analysis
# Date: 2/13/20
# Author: Eric Hestekin, Schyuler Lujan
# Repo: https://github.com/ehestekin/MSBA_5210_DataVis
```
``` {r echo = FALSE, include = FALSE}
# Clear environment of variables and functions
rm(list = ls(all = TRUE))
# Clear environmet of packages
if(is.null(sessionInfo()$otherPkgs) == FALSE)lapply(paste("package:", names(sessionInfo()$otherPkgs), sep=""), detach, character.only = TRUE, unload = TRUE)
```
``` {r setup, include = FALSE, warning = FALSE}
# Load Library
library(tidyverse)
library(patchwork)
library(scales)
#this forces kable html outputs (avoids format = 'html' in each call)
options(knitr.table.format = "html")
```
Source Data: [US domestic flights from 1990 to 2009, US Census Bureau](http://academictorrents.com/details/a2ccf94bbb4af222bf8e69dad60a68a29f310d9a)
```{r, warning = FALSE}
#Load Data TSV
#no column headers so set array of names
col_labels = c('origin_code',
'dest_code',
'origin_city',
'dest_city',
'passengers',
'seats',
'num_flights',
'distance',
'year_month',
'origin_pop',
'dest_pop')
flights_df <- read_tsv('flight_edges.tsv',col_names = col_labels)
```
```{r, include = FALSE}
#Tidy
#separate month_year
#first 4 chars are year, convert from string to num
flights_df <- flights_df %>%
separate(year_month, into = c('year','month'), sep = 4, convert = TRUE)
#add percentage of seats filled for each flight
#and remove rows where seats are zero (no flight data)
# and also remove flights where seat/flight < 4 (non commercial flight)
com_flights_df <- flights_df %>%
mutate(seat_fill_perc = (passengers/seats) * 100) %>%
filter(seats != 0) %>%
filter(seats/num_flights > 4)
#also throw out seat filled percentages > 100
#this shouldn't be possible so don't trust data point
com_flights_df <- filter(com_flights_df, seat_fill_perc <= 100)
#get summary now with tidy commercial flights data set
summary(com_flights_df)
```
Actions
----------
### Seattle is a good candidate for capacity increase {data-width=83}
+ Seattle is ***2nd most full*** west coast airport (behind SFO)
+ 8th most full in US
```{r, include = F}
com_flights_by_city <- com_flights_df %>%
group_by(origin_code, year) %>%
summarize(tot_passenger_departs = sum(passengers),
origin_pop = mean(origin_pop), #still want origin pop data. mean should just return the pop
avg_seat_fill = sum(passengers)/sum(seats) * 100,
pass_pop_ratio = tot_passenger_departs / origin_pop) %>%
left_join(distinct(select(com_flights_df, origin_code, origin_city)), by = 'origin_code')
#limit to cities with significant departures (>100 passengers daily)
com_flights_by_city <- com_flights_by_city %>% filter(tot_passenger_departs > 36500)
com_flights_by_city_2009 <- com_flights_by_city %>% filter(year == 2009)
summary(com_flights_by_city)
```
```{r}
com_flights_by_city_2009 %>%
arrange(desc(avg_seat_fill)) %>%
filter(tot_passenger_departs > 1e6,
avg_seat_fill > 79.475) %>%
select(origin_code, origin_city, avg_seat_fill) %>%
knitr::kable(col.names = c('Airport Code',
'City',
'Percent Seats Filled'),
align = 'c',
digits = 2) %>%
kableExtra::row_spec(8, bold = T,
color = '#a3a3a3', background = '#344182')
```
```{r, include = F}
route_data <- com_flights_df %>%
group_by(origin_city, dest_city, year) %>%
summarise(total_flights = sum(num_flights),
total_passengers = sum(passengers),
total_seats = sum(seats),
avg_seat_fill_perc = mean(total_passengers / total_seats) * 100)
#lets look at 5 most popular routes from last year and see how it has trended over the previous 10 years
#only look at routes with more than a flight every other day
route_data_2009 <- route_data %>% filter(year == 2009, total_flights > 365/2)
```
### Specific Routes {data-width=83}
+ Seattle had two of the top three most-filled routes
```{r}
route_data_2009 %>%
arrange(desc(avg_seat_fill_perc)) %>%
filter(avg_seat_fill_perc > 94.4) %>%
select(origin_city, dest_city, avg_seat_fill_perc) %>%
ungroup() %>%
add_row(origin_city = 'All Routes',
dest_city = 'Median:',
avg_seat_fill_perc = 75.284) %>%
knitr::kable(col.names = c('Origin',
'Destination',
'Percent Seats Filled'),
align = rep('c', 3),
digits = 2) %>%
kableExtra::row_spec(c(1,3), bold = T,
color = '#a3a3a3', background = '#344182') %>%
kableExtra::kable_styling(full_width = T) %>%
kableExtra::row_spec(4, bold = T)
# summary(route_data_2009)
```
### Most popular routes, one stands out as needing improvement {data-width=117}
```{r}
five_busiest_routes_data <- route_data_2009 %>%
arrange(desc(total_passengers)) %>%
filter(total_passengers %in%
#manually select rows for now had a hard time filtering properly
c(1501883, 1494141, 1399554, 1380928, 1352360)) %>%
mutate(route = paste(origin_city, ' - ', dest_city))
five_busiest_plot <-
ggplot(five_busiest_routes_data,
mapping = aes(x = route,
y = avg_seat_fill_perc/100)) +
geom_bar(stat = 'identity',
fill = c('grey','#750000','grey','grey','grey')) +
geom_text(aes(x = route, y = avg_seat_fill_perc/100,
label = scales::percent(avg_seat_fill_perc/100,
accuracy = 1),
fontface = c('plain','plain','plain','bold','plain')),
position = position_dodge(width = 0.9),
hjust = 1) +
coord_flip() + theme_classic() +
theme(axis.line.y = element_blank(),
axis.line.x = element_blank(),
axis.title.y = element_blank(),
plot.title = element_text(color = "#808080",
size = 16,
face = "bold", hjust = 2.5),
plot.subtitle = element_text(color = "#808080",
size = 12,
face = "plain", hjust = -1),
axis.text.x = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_text(color = "#808080", size = 11,
face = c('plain','bold','plain','plain','plain')),
legend.title = element_text(color = "#808080", size = 12, face = "plain"),
legend.text = element_text(color = "#808080", size = 12, face = "plain"),
plot.caption = element_text(color = "#808080", size = 10, face = "plain", margin = margin(t=10))
) +
scale_y_continuous(labels = percent_format(accuracy = 1)) +
labs(title = 'Houston to Dallas flights are too empty',
subtitle = 'Top 5 routes by passenger volume') +
ylab('Percentage of Seats Filled')
five_busiest_plot
```
### Significant savings potential {data-width=50}
If Houston - Dallas had *average* rate of seats filled, it would have saved around **104,000 empty seats** in 2009
Overall market data row
-----------------------
### Overall passenger volume has decreased recently
```{r}
pop_trend_data <- com_flights_df %>%
group_by(year) %>%
summarise(tot_passenger_vol = sum(passengers))
pop_trend_plot <- ggplot(pop_trend_data, mapping = aes(x = year, y = tot_passenger_vol/1e6)) +
geom_line(color = 'grey') +
geom_line(filter(pop_trend_data, year > 2006),
mapping = aes(x = year, y = tot_passenger_vol/1e6),
size = 1.5,
color = '#750000')
pop_trend_plot <- pop_trend_plot +
geom_point(data = filter(pop_trend_data, year == 2009),
mapping = aes(x = year, y = tot_passenger_vol/1e6),
fill = '#344182', shape = 25, size = 3) +
geom_text(aes(x = 2010, y = tot_passenger_vol[year == 2009]/1e6 - 10), label = '526.3M', color = '#344182' )
#get percent decrease over last few years
# (pop_trend_data$tot_passenger_vol[20] - pop_trend_data$tot_passenger_vol[18]) /
# pop_trend_data$tot_passenger_vol[18] * 100
pop_trend_plot + theme_classic() +
theme(axis.line.y = element_blank(),
axis.line.x = element_blank(),
axis.title.x = element_blank(),
plot.title = element_text(color = "#808080",
size = 16, face = "bold"),
plot.subtitle = element_text(color = "#808080",
size = 12, face = "plain"),
axis.text.x = element_text(color = "#808080", size = 11),
axis.text.y = element_text(color = "#808080", size = 11),
axis.title = element_text(color = "#808080", size = 11, face = "plain"),
legend.title = element_text(color = "#808080", size = 12, face = "plain"),
legend.text = element_text(color = "#808080", size = 12, face = "plain"),
plot.caption = element_text(color = "#808080", size = 10, face = "plain", margin = margin(t=10))
) +
scale_x_continuous(breaks = seq(1990,2010, by = 6)) +
labs(title = 'Total passenger volume is falling',
subtitle = expression(paste('Down ',
italic(bold('9.1%')),
' from 2007 peak')),
y = 'Millions of Passengers')
```
### Despite volume drop, operating efficiency is up
```{r}
filled_trend_data <- com_flights_df %>%
group_by(year) %>%
summarise(avg_seat_fill_perc = mean(seat_fill_perc))
filled_trend_plot <- ggplot(filled_trend_data,
mapping = aes(x = year,
y = avg_seat_fill_perc/100)) +
geom_line(color = 'grey') +
geom_line(filter(filled_trend_data, year > 2006),
mapping = aes(x = year, y = avg_seat_fill_perc/100),
size = 1.5,
color = '#344182')
filled_trend_plot <- filled_trend_plot +
geom_point(data = filter(filled_trend_data, year == 2009),
mapping = aes(x = year, y = avg_seat_fill_perc/100),
fill = '#344182', shape = 25, size = 3) +
geom_text(aes(x = 2010, y = avg_seat_fill_perc[year == 2009]/100 - 0.01), label = '71%', color = '#344182' )
filled_trend_plot + theme_classic() +
theme(axis.line.y = element_blank(),
axis.line.x = element_blank(),
axis.title = element_blank(),
plot.title = element_text(color = "#808080",
size = 16, face = "bold"),
plot.subtitle = element_text(color = "#808080",
size = 12, face = "plain"),
axis.text.x = element_text(color = "#808080", size = 11),
axis.text.y = element_text(color = "#808080", size = 11),
legend.title = element_text(color = "#808080", size = 12, face = "plain"),
legend.text = element_text(color = "#808080", size = 12, face = "plain"),
plot.caption = element_text(color = "#808080", size = 10, face = "plain", margin = margin(t=10))
) +
scale_x_continuous(breaks = seq(1990,2010, by = 6)) +
scale_y_continuous(labels = percent_format()) +
labs(title = 'Percentage of seats filled has gone up',
subtitle = 'Up 1 percentage point from 2007 total passenger peak')
```